# clear environment
rm(list=ls())

# load packages
library(openxlsx)

# load data set with keyword counts and crime numbers
dat = openxlsx::read.xlsx("vsberichte_metadata.xlsx", sheet = 1)

# binary varibable indicating whether interior minister is from a center-right party
dat$cr_intmin <- ifelse(dat$intmin_party == "cdu" | dat$intmin_party== "csu" | dat$intmin_party=="fdp", 1, ifelse(dat$intmin_party=="pro", NA, 0))

# remove those without center-right or center-left interior minister
dat <- dat[!is.na(dat$cr_intmin),]

# looged keyword ratio
dat$rwe_lwe_kw_ratio_log <- log(dat$count_right_kw + 0.5) - log(dat$count_left_kw + 0.5)

# remove those with NA in keyword ratio
dat <- dat[!is.na(dat$rwe_lwe_kw_ratio_log),]

# logged crime ratio
dat$rwe_lwe_crime_ratio_log <- log(dat$extreme_crime_right + 0.5) - log(dat$extreme_crime_left + 0.5)

# get crime ratio on federal level as a separate variable and merge with the main data 
crime_fed <- dat[dat$jurisdiction=="bund",c("year", "extreme_crime_right", "extreme_crime_left")]
names(crime_fed) <- c("year", "extreme_crime_right_fed", "extreme_crime_left_fed")
crime_fed$rwe_lwe_crime_ratio_log_fed <- log(crime_fed$extreme_crime_right_fed + 0.5) - log(crime_fed$extreme_crime_left_fed + 0.5)
dat <- merge(dat, crime_fed, by="year", all.x=T)

# impute missing values in the logged crime ratio with value on the federal level
dat$rwe_lwe_crime_ratio_log_fedimp <- ifelse(is.na(dat$rwe_lwe_crime_ratio_log),
                                             dat$rwe_lwe_crime_ratio_log_fed,
                                             dat$rwe_lwe_crime_ratio_log)

# create bias measure (difference between logged chapter length ratio and looged crime ratio)
dat$bias_ratio_fedimp <- dat$rwe_lwe_kw_ratio_log - dat$rwe_lwe_crime_ratio_log_fedimp

# create decade indicator
dat$decade <- NA
dat$decade[dat$year<1960]<-1950
dat$decade[dat$year>=1960 & dat$year<1970]<-1960
dat$decade[dat$year>=1970 & dat$year<1980]<-1970
dat$decade[dat$year>=1980 & dat$year<1990]<-1980
dat$decade[dat$year>=1990 & dat$year<2000]<-1990
dat$decade[dat$year>=2000 & dat$year<2010]<-2000
dat$decade[dat$year>=2010 & dat$year<2020]<-2010
dat$decade[dat$year>=2020 & dat$year<2030]<-2020

## regressions models
# position
mod_1 <- lm(rwe_lwe_kw_ratio_log ~ factor(cr_intmin), data=dat)
mod_2 <- lm(rwe_lwe_kw_ratio_log ~ factor(cr_intmin) + factor(jurisdiction), data=dat)
mod_3 <- lm(rwe_lwe_kw_ratio_log ~ factor(cr_intmin) + factor(jurisdiction) + factor(decade), data=dat)
mod_4 <- lm(rwe_lwe_kw_ratio_log ~ factor(cr_intmin) + factor(jurisdiction) + factor(year), data=dat)
# bias
mod_5 <- lm(bias_ratio_fedimp ~ factor(cr_intmin), data=dat)
mod_6 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction), data=dat)
mod_7 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction) + factor(decade), data=dat)
mod_8 <- lm(bias_ratio_fedimp ~ factor(cr_intmin) + factor(jurisdiction) + factor(year), data=dat)
